Layernormgrad
计算 Layer Normalization 操作的梯度。该算子是 Layer Normalization 的反向传播部分,用于计算损失函数相对于输入 x、以及可学习参数 gamma 和 beta 的梯度。
\[ \begin{align}\begin{aligned}\text{dg}_i = \sum_{j} \text{dy}_{j,i} \cdot \frac{x_{j,i} - \mu_j}{\sqrt{\sigma_j^2 + \epsilon}}\\\text{db}_i = \sum_{j} \text{dy}_{j,i}\\\text{dx} = f(\text{dy}, x, \gamma, \mu, \sigma^2)\end{aligned}\end{align} \]
其中 \(i\) 为特征维度下标(共 param_num 个),\(j\) 为独立归一化单元下标(共 param_size 个);\(\mu_j\) 是第 \(j\) 个单元的均值,\(\sigma_j^2\) 是其方差,\(\epsilon\) 是一个为了防止除零而添加的极小值。dx 的计算较为复杂,它依赖于 dy、x、gamma 以及前向传播得到的 mean 和 var。
- 输入:
x - 前向传播时的输入数据地址。
dy - 后续层反向传播回来的梯度数据地址。
- params - 参数打包成 long long 数组,各元素按以下顺序依次存放(params[0] 至 params[8]):
var - 前向传播时计算出的方差(variance)地址。
mean - 前向传播时计算出的均值(mean)地址。
gamma - 前向传播时使用的可学习缩放参数 \(\gamma\) 的地址。
dg - 输出,计算出的关于 gamma 的梯度地址。
db - 输出,计算出的关于 beta 的梯度地址。
param_num - 特征维度的大小,也是 gamma 和 beta 的大小。
param_size - 进行独立归一化的单元数量(例如批处理大小 Batch Size)。
block_num - 块的数量(通常等于 param_size)。
block_size - 每个块的大小(通常等于 param_num)。
core_mask - 核掩码(仅共享存储版本需要);它是函数的独立参数,不放入 params 数组。
- 输出:
dx - 计算出的关于输入 x 的梯度地址。
dg - 计算出的关于参数 gamma 的梯度地址。
db - 计算出的关于参数 beta 的梯度地址。
- 支持平台:
FT78NE、MT7004
备注
FT78NE 支持fp32
MT7004 支持fp16, fp32
共享存储版本:
-
void hp_layer_norm_grad_s(half *x, half *dy, half *dx, long long *params, int core_mask)
-
void fp_layer_norm_grad_s(float *x, float *dy, float *dx, long long *params, int core_mask)
C调用示例:
1//FT78NE示例
2#include <stdio.h>
3#include <layernormgrad.h> // 假设头文件名为 layernormgrad.h
4
5int main(int argc, char* argv[]) {
6 float *x = (float *)0x81000000;
7 float *dy = (float *)0x82000000;
8 float *var = (float *)0x83000000;
9 float *mean = (float *)0x84000000;
10 float *gamma = (float *)0x85000000;
11
12 int param_num = 8;
13 int param_size = 128;
14 int block_num = param_size;
15 int block_size = param_num;
16
17 float *dx = (float *)0x86000000;
18 float *dg = (float *)0x87000000;
19 float *db = (float *)0x88000000;
20 float *check_dx = (float *)0x89000000;
21 float *check_dg = (float *)0x8A000000;
22 float *check_db = (float *)0x8B000000;
23
24 int i = 0;
25
26 srand(seed++);
27
28 float f_min = 1.0;
29 float f_max = 2.0;
30
31 for(i = 0; i < param_num * param_size; ++i) {
32 x[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
33 dy[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
34 }
35
36 for(i = 0; i < block_num; i ++) {
37 var[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
38 mean[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
39 }
40
41 for(i = 0; i < param_num; i ++) {
42 gamma[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
43 }
44
45 long long params[12];
46 params[0] = (long long)var;
47 params[1] = (long long)mean;
48 params[2] = (long long)gamma;
49 params[3] = (long long)dg;
50 params[4] = (long long)db;
51 params[5] = (long long)param_num;
52 params[6] = (long long)param_size;
53 params[7] = (long long)block_num;
54 params[8] = (long long)block_size;
55
56 int core_mask = 0b1111;
57 fp_layer_norm_grad_s(x, dy, dx, (long long *)params, core_mask);//调用汇编
58 return 0;
59}
私有存储版本:
-
void hp_layer_norm_grad_p(half *x, half *dy, half *dx, long long *params)
-
void fp_layer_norm_grad_p(float *x, float *dy, float *dx, long long *params)
C调用示例:
1//FT78NE示例
2#include <stdio.h>
3#include <layernormgrad.h> // 假设头文件名为 layernormgrad.h
4
5int main(int argc, char* argv[]) {
6 float *x = (float *)0x10010000;
7 float *dy = (float *)0x10020000;
8 float *var = (float *)0x10030000;
9 float *mean = (float *)0x10040000;
10 float *gamma = (float *)0x10050000;
11
12 int param_num = 8;
13 int param_size = 128;
14 int block_num = param_size;
15 int block_size = param_num;
16
17 float *dx = (float *)0x10016000;
18 float *dg = (float *)0x10026000;
19 float *db = (float *)0x10036000;
20 float *check_dx = (float *)0x10045000;
21 float *check_dg = (float *)0x10055000;
22 float *check_db = (float *)0x10060000;
23
24 int i = 0;
25
26 srand(seed++);
27
28 float f_min = 1.0;
29 float f_max = 2.0;
30
31 for(i = 0; i < param_num * param_size; ++i) {
32 x[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
33 dy[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
34 }
35
36 for(i = 0; i < block_num; i ++) {
37 var[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
38 mean[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
39 }
40
41 for(i = 0; i < param_num; i ++) {
42 gamma[i] = f_min + ((float)rand() / (float)RAND_MAX) * (f_max - f_min);
43 }
44
45 long long params[12];
46 params[0] = (long long)var;
47 params[1] = (long long)mean;
48 params[2] = (long long)gamma;
49 params[3] = (long long)dg;
50 params[4] = (long long)db;
51 params[5] = (long long)param_num;
52 params[6] = (long long)param_size;
53 params[7] = (long long)block_num;
54 params[8] = (long long)block_size;
55
56 fp_layer_norm_grad_p(x, dy, dx, (long long *)params);//调用汇编
57 return 0;
58}